rm(list = ls()) # Removes all variables and functions from global environment
getwd()
[1] "C:/Users/ngduy/Documents/Project/Final_code/code"
setwd("C:/Users/ngduy/Documents/Project/Final_code")
Warning: The working directory was changed to C:/Users/ngduy/Documents/Project/Final_code inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
# BiocManager::install("tximport")
# BiocManager::install("tximportData")
# BiocManager::install("DESeq2")
# install.packages('pheatmap')
# install.packages("DOSE")
# install.packages("enrichplot")
# install.packages("ggupset")
# BiocManager::install("biomaRt")
# BiocManager::install("pathview")
library(biomaRt) # BiomaRt is used to access Ensembl databases for gene annotation
library(tximport) # Import transcript-level quantification data
library(readr) # For reading data files
library(DESeq2)
library(dplyr) # for data manipulation
library(PCAtools) # PCAtools is used for PCA analysis and visualization
library(tidyverse) # includes ggplot2, for data visualisation. dplyr, for data manipulation.
library(pheatmap)
library('org.Hs.eg.db') # Annotation package for human genes
library(GEOquery) # For downloading and processing data from GEO
metadata <- read.csv('../data/muscle/muscle_metadata.csv', header = TRUE, sep = ",", row.names = 1)
metadata.subset <- metadata %>%
rename(sex = 1, age = 2, estradiol =3) %>%
filter(sex == "F")
# Initialize group as NA
metadata.subset$group <- NA
# Assign 'high' and 'low' based on age and estradiol thresholds
metadata.subset$group[metadata.subset$estradiol >= 20] <- 'high'
metadata.subset$group[metadata.subset$estradiol < 20] <- 'low'
metadata.subset$group <- as.factor(metadata.subset$group)
metadata.subset$age <- as.factor(metadata.subset$age)
all_samples = rownames(metadata.subset)
table(metadata.subset$group)
high low
12 24
count_data <- read.csv("../data/muscle/muscle_count_data.csv",header = TRUE, sep = ",", row.names = 1)
colnames(count_data)
[1] "X2C" "X1A" "X3E" "X1C" "X3G" "X3A" "X9G" "X4A" "X2E" "X1G" "X2G" "X2A" "X9C" "X4E" "X7A" "X6A" "X1E" "X5E" "X7E" "X8G" "X5A" "X8E" "X10" "X14" "X1" "X20"
[27] "X21" "X2" "X31" "X36" "X37" "X41" "X46" "X48" "X5" "X7" "X11" "X17" "X18" "X22" "X23" "X24" "X26" "X32" "X35" "X39" "X40" "X42" "X45" "X47" "X4" "X51"
[53] "X52" "X53" "X6" "X13" "X15" "X16" "X19" "X25" "X28" "X29" "X30" "X33" "X34" "X38" "X3" "X43" "X44" "X49" "X50" "X8" "X9"
colnames(count_data) <- gsub("X", "", colnames(count_data))
colnames(count_data)
[1] "2C" "1A" "3E" "1C" "3G" "3A" "9G" "4A" "2E" "1G" "2G" "2A" "9C" "4E" "7A" "6A" "1E" "5E" "7E" "8G" "5A" "8E" "10" "14" "1" "20" "21" "2" "31" "36" "37"
[32] "41" "46" "48" "5" "7" "11" "17" "18" "22" "23" "24" "26" "32" "35" "39" "40" "42" "45" "47" "4" "51" "52" "53" "6" "13" "15" "16" "19" "25" "28" "29"
[63] "30" "33" "34" "38" "3" "43" "44" "49" "50" "8" "9"
all(all_samples %in% colnames(count_data)) # Check if all sample names in metadata are present in count data
[1] TRUE
count_data <- count_data[,all_samples]
dim(count_data)
[1] 13057 36
all(colnames(count_data) %in% rownames(metadata.subset))
[1] TRUE
all(colnames(count_data) == rownames(metadata.subset))
[1] TRUE
dds <- DESeqDataSetFromMatrix(countData = count_data, colData = metadata.subset, design = ~ group)
dds
class: DESeqDataSet
dim: 13057 36
metadata(1): version
assays(1): counts
rownames(13057): ENSG00000000457 ENSG00000000460 ... ENSG00000280209 ENSG00000280358
rowData names(0):
colnames(36): 8 11 ... 9G 3G
colData names(5): sex age estradiol Total.Testosterone...ng.dL. group
While it is not necessary to pre-filter low count genes before running the DESeq2 functions, there are two reasons which make pre-filtering useful: by removing rows in which there are very few reads, we reduce the memory size of the dds data object, and we increase the speed of count modeling within DESeq2. It can also improve visualizations, as features with no information for differential expression are not plotted in dispersion plots or MA-plots.
Here we perform pre-filtering to keep only rows that have a count of at least 10 for a minimal number of samples. The count of 10 is a reasonable choice for bulk RNA-seq.
keep <- rowSums(counts(dds) >= 10) >= 12 # keep rows with at least 10 counts in at least 12 samples
dds <- dds[keep, ]
dds
class: DESeqDataSet
dim: 12983 36
metadata(1): version
assays(1): counts
rownames(12983): ENSG00000000457 ENSG00000000460 ... ENSG00000279413 ENSG00000215414
rowData names(0):
colnames(36): 8 11 ... 9G 3G
colData names(5): sex age estradiol Total.Testosterone...ng.dL. group
#add gene symbol and gene name
library(org.Hs.eg.db)
library(AnnotationDbi)
all_gene_id <- row.names(dds)
annotation <- AnnotationDbi::select(org.Hs.eg.db,
keys = all_gene_id,
columns = c("SYMBOL", "GENENAME", "ENTREZID"),
keytype = "ENSEMBL")
'select()' returned 1:many mapping between keys and columns
annotation <- annotation[!duplicated(annotation$ENSEMBL),] # remove duplicated rows based on ENSEMBL column
If you never tell the DESeq2 functions which level you want to compare against(e.g. which level represents the control group), the comparisons will be based on the alphabetical order of the levels.
dds$group <- relevel(dds$group, ref = "low")
#Differential expression analysis
dds <- DESeq(dds)
estimating size factors
estimating dispersions
gene-wise dispersion estimates
mean-dispersion relationship
final dispersion estimates
fitting model and testing
-- replacing outliers and refitting for 25 genes
-- DESeq argument 'minReplicatesForReplace' = 7
-- original counts are preserved in counts(dds)
estimating dispersions
fitting model and testing
# Save normalized counts
normalized_counts <- counts(dds, normalized = TRUE)
head(normalized_counts)
8 11 13 2 53 49 46 20 36 3 37 18 6 19
ENSG00000000457 213.19506 251.13500 280.26409 234.906743 198.76927 202.76094 209.31549 195.03625 221.30552 246.46363 226.17058 250.11838 177.06882 246.653282
ENSG00000000460 52.55506 48.02582 73.80620 40.983730 49.94203 74.91168 49.60083 36.00669 52.83420 69.13004 54.79904 45.65653 49.29757 56.919988
ENSG00000000938 25.78173 23.01237 21.94238 9.996032 26.96869 45.94583 22.81638 37.00688 24.92179 28.05277 17.93423 34.73866 25.15182 6.990174
ENSG00000000971 239.96839 350.18825 258.32171 691.725388 476.44693 621.26750 442.43938 332.06172 565.22626 328.61817 684.48982 323.56585 367.21660 295.584500
ENSG00000001461 168.57284 211.11349 225.40813 195.922220 166.80637 215.74563 185.50709 191.03551 206.35245 328.61817 193.29116 205.45439 315.90688 177.750139
ENSG00000004455 729.82122 987.53085 714.12488 988.607528 695.19301 712.16034 650.76285 770.14314 834.38163 1008.89785 736.29982 853.57861 999.03036 945.670682
21 29 34 52 43 35 22 40 7 10 1E 1G 2E 2G
ENSG00000000457 290.55498 244.05204 229.93781 216.72793 229.74525 208.13909 216.55979 212.48416 220.81448 251.636331 268.49857 220.36399 307.43204 271.11456
ENSG00000000460 62.68823 60.76398 66.40445 50.93606 62.65780 54.77345 58.88031 43.48513 50.72765 51.924957 74.41533 53.84460 66.09288 65.02748
ENSG00000000938 16.91587 12.94970 27.75111 17.97743 27.84791 14.93821 17.96349 14.82448 63.65823 4.992784 27.15154 14.95683 32.04503 22.00930
ENSG00000000971 307.47085 222.13716 441.04450 356.55241 409.76209 471.05163 405.17639 453.62897 339.17899 427.382341 376.09912 271.21722 297.41797 483.20418
ENSG00000001461 195.03005 277.92049 252.73337 198.75050 244.66377 210.13086 243.50502 276.72355 191.96934 243.647876 221.23478 262.24312 261.36731 241.10188
ENSG00000004455 738.32806 866.63378 801.80900 745.06469 732.99675 790.72938 618.74227 944.81327 691.28856 723.953731 786.38907 901.39842 710.99919 954.40328
4A 4E 5A 6A 9C 8G 9G 3G
ENSG00000000457 252.45182 265.27704 227.68142 230.16490 202.92194 238.69785 239.09166 268.12803
ENSG00000000460 64.85916 49.67735 74.89520 67.04804 57.13336 68.62563 84.67830 72.76337
ENSG00000000938 22.95017 42.72252 10.98463 14.01004 35.46209 35.80468 23.90917 37.87682
ENSG00000000971 356.22649 339.79306 486.31952 389.27890 424.55997 359.04135 363.61856 223.27389
ENSG00000001461 218.52549 231.49644 257.63950 196.14053 216.71275 295.38859 180.31496 278.09561
ENSG00000004455 763.34247 767.01826 626.12390 684.49040 720.07735 678.29973 825.86244 862.19607
write.csv(normalized_counts, file = "../result/muscle/estrogen_normalized_counts.csv",)
DESeq2::plotDispEsts(dds)
# summary the DEGs results
res <- results(dds,alpha=0.05)
summary(res)
out of 12983 with nonzero total read count
adjusted p-value < 0.05
LFC > 0 (up) : 16, 0.12%
LFC < 0 (down) : 32, 0.25%
outliers [1] : 0, 0%
low counts [2] : 0, 0%
(mean count < 7)
[1] see 'cooksCutoff' argument of ?results
[2] see 'independentFiltering' argument of ?results
So, with an adjusted p-value of 0.05, we identify 25 downregulated genes and 13 upregulated genes in the high estradiol group compared to the low estradiol group.
DESeq2::plotMA(res, main="MA-plot", ylim=c(-5,5))
#get results table
res.mod <- as.data.frame(res)
res.mod
#add new column indicating which genes are upregulated or downregulated or not significant
res.mod$diffexpres.modsed <- "NO"
res.mod$diffexpres.modsed[which(res.mod$padj < 0.05 & res.mod$log2FoldChange > 1)] <- "UP"
res.mod$diffexpres.modsed[which(res.mod$padj < 0.05 & res.mod$log2FoldChange < -1)] <- "DOWN"
#add gene symbol and gene name
res.mod$ENSEMBL <- row.names(res.mod)
res.mod <- merge(res.mod, annotation, by = "ENSEMBL", all.x = TRUE)
res.mod
# add a gene symbol column which will be used for labeling the points in the volcano plot (only differentially expres.modsed genes)
res.mod$delabel <- NA
res.mod <- res.mod[order(res.mod$padj),]
res.mod$delabel[1:20] <- res.mod$SYMBOL[1:20]
#Vocalno plot
volcano_plot <- ggplot(data = res.mod, aes(x=log2FoldChange, y=-log10(padj),color = diffexpres.modsed, label = delabel))+
geom_vline(xintercept = c(-1, 1), col = "gray", linetype = 'dashed') +
geom_hline(yintercept = -log10(0.05), col = "gray", linetype = 'dashed') +
geom_point(size = 2) +
scale_color_manual(values = c("#00AFBB", "grey", "#bb0c00"), # to set the colours of our variable<br />
labels = c("Downregulated", "Not significant", "Upregulated")) + # to set the labels in case we want to overwrite the categories from the dataframe
coord_cartesian(xlim = c(-5, 5),ylim = c(0,5))+
ggtitle("Volcano plot of DEGs between High and Low Estradiol level")+
geom_text_repel(max.overlaps = Inf)
# Save the plot to a file
ggsave("../result/muscle/img/estrogen_volcano_plot.png", plot = volcano_plot, width = 6, height = 6, dpi = 300)
# Display the plot in the document
print(volcano_plot)
write.csv(res.mod, file = "../result/muscle/estrogen_DEGs_result.csv", row.names = TRUE)
vst <- vst(dds, blind=FALSE)
head(assay(vst))
8 11 13 2 53 49 46 20 36 3 37 18 6 19 21
ENSG00000000457 8.193626 8.372626 8.495975 8.298865 8.118994 8.140053 8.173949 8.099004 8.233879 8.351791 8.257475 8.368118 7.998490 8.352643 8.537079
ENSG00000000460 6.948367 6.886511 7.201146 6.782854 6.913099 7.212936 6.908411 6.702991 6.952070 7.149967 6.977800 6.852727 6.904228 7.004938 7.075682
ENSG00000000938 6.515825 6.458043 6.434678 6.113644 6.539516 6.856909 6.453810 6.719541 6.498274 6.560642 6.341013 6.681617 6.503002 6.002352 6.315421
ENSG00000000971 8.322293 8.754189 8.404084 9.598461 9.126759 9.460503 9.035620 8.691657 9.340417 8.679458 9.584890 8.661367 8.810516 8.556739 8.602138
ENSG00000001461 7.948255 8.183103 8.253803 8.103775 7.937573 8.206411 8.046616 8.077253 8.158729 8.679458 8.089559 8.154082 8.633493 8.002439 8.098971
ENSG00000004455 9.667869 10.065662 9.639677 10.067113 9.604919 9.636109 9.519879 9.737849 9.842737 10.094188 9.679345 9.872651 10.081085 10.008082 9.682920
29 34 52 43 35 22 40 7 10 1E 1G 2E 2G 4A 4E
ENSG00000000457 8.340911 8.275475 8.211304 8.274560 8.167926 8.210468 8.190041 8.231475 8.374843 8.447454 8.229266 8.601992 8.458389 8.378443 8.433870
ENSG00000000460 7.052556 7.119101 6.926646 7.075320 6.977468 7.029466 6.820820 6.923819 6.939963 7.207656 6.965374 7.115521 7.103198 7.101239 6.909464
ENSG00000000938 6.206790 6.554811 6.342081 6.556686 6.263207 6.341737 6.260094 7.087170 5.913730 6.543113 6.263716 6.634653 6.436158 6.456702 6.809387
ENSG00000000971 8.237941 9.031755 8.775504 8.942181 9.112684 8.928563 9.066256 8.716541 8.993327 8.839032 8.458816 8.563839 9.144195 8.774420 8.718667
ENSG00000001461 8.486443 8.379684 8.118895 8.343679 8.178107 8.338432 8.481549 8.082361 8.339080 8.233533 8.420956 8.417205 8.327486 8.220213 8.282854
ENSG00000004455 9.892644 9.790510 9.694727 9.673506 9.772304 9.455303 10.006878 9.597645 9.657395 9.765108 9.944565 9.633996 10.020286 9.726283 9.732546
5A 6A 9C 8G 9G 3G
ENSG00000000457 8.264722 8.276552 8.140895 8.316449 8.318263 8.445898
ENSG00000000460 7.212761 7.126461 7.007633 7.144315 7.312452 7.189919
ENSG00000000938 6.146274 6.237411 6.693866 6.699616 6.477163 6.733722
ENSG00000000971 9.152162 8.880315 8.985257 8.783753 8.798800 8.243474
ENSG00000001461 8.401128 8.104948 8.211228 8.555979 8.017205 8.487157
ENSG00000004455 9.470450 9.584891 9.650433 9.573177 9.829260 9.885879
library(vsn)
meanSdPlot(assay(vst), ranks = FALSE)
# Order samples by group
sample_ordered <- metadata.subset[order(metadata.subset$group),]
sample_id_ordered <- rownames(sample_ordered)
sample_ordered$group <- as.factor(sample_ordered$group)
# Reorder rlog data
vst_ordered <- vst[, sample_id_ordered]
# Assign colors to each group
group_colors <- as.character(as.numeric(sample_ordered$group)) # 1, 2, etc.
group_colors <- c("skyblue", "orange")[as.numeric(sample_ordered$group)] # custom colors
# Create boxplot
boxplot(assay(vst_ordered),
col = group_colors,
las = 2, # rotate x-axis labels
outline = FALSE, # don't plot outliers
main = "Boxplot of vst-transformed counts",
ylab = "vst-transformed counts")
# Add legend
legend("topright",
legend = levels(sample_ordered$group),
fill = c("skyblue", "orange"),
title = "Group")
p <- pca(assay(vst), metadata = metadata.subset, removeVar = 0.1)
-- removing the lower 10% of variables based on variance
screeplot(p)
pca_pairplot <- pairsplot(p,
components = c(1:4),
triangle = TRUE,
trianglelabSize = 12,
hline = 0, vline = 0,
pointSize = 3,
gridlines.major = FALSE, gridlines.minor = FALSE,
title = 'Pairs plot', plotaxes = FALSE,
margingaps = unit(c(-0.01, -0.01, -0.01, -0.01), 'cm'),
colby = 'group',
lab = p$metadata$estradiol,
labSize = 5,)
Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.Coordinate system already present. Adding new coordinate system, which will replace the existing one.
print(pca_pairplot)
res.df <- data.frame(res)
degs_by_pvalue <- res.df[res.df$pvalue <0.05 & !is.na(res.df$pvalue),] #filter all degs with adjusted p-value < 0.05
degs_by_pvalue
# Add Ensembl as a column to merge
degs_by_pvalue$ensembl_gene_id <- rownames(degs_by_pvalue)
# Merge annotation
annotated_degs_by_pvalue <- merge(degs_by_pvalue, annotation, by.x = "ensembl_gene_id",by.y = 'ENSEMBL', all.x = TRUE)
annotated_degs_by_pvalue
annotated_degs_by_pvalue <- annotated_degs_by_pvalue %>%
arrange(pvalue) %>% # Order by p-value in ascending order
filter(SYMBOL != "") %>% # Filter out rows where the SYMBOL is an empty string
distinct(SYMBOL, .keep_all = TRUE) %>% # Remove duplicate rows based on the SYMBOL column, keeping the first one which is the one with the lowest p-value
filter(!is.na(ensembl_gene_id)) # Filter out rows where the ensembl_gene_id is NA
annotated_degs_by_pvalue
NA
filtered_tx_id <- annotated_degs_by_pvalue$ensembl_gene_id
vst_matrix <- assay(vst)
vst_degs_by_pvalue <- vst_matrix[filtered_tx_id, ]
head(vst_degs_by_pvalue)
8 11 13 2 53 49 46 20 36 3 37 18 6 19
ENSG00000204291 10.504837 10.818403 10.510764 11.201014 11.432070 10.983474 10.183149 10.594279 11.229322 11.201732 10.943267 10.765992 11.452041 10.983873
ENSG00000184811 6.313923 5.734970 6.207270 5.734825 6.113386 8.859385 6.205211 6.390672 6.316211 6.003291 7.519063 6.175446 5.864090 6.113303
ENSG00000181026 8.035022 6.798786 7.123910 6.940691 7.264840 7.570137 7.238828 7.079525 7.016246 6.799647 8.490747 7.153844 6.739437 6.899153
ENSG00000116774 7.472723 7.634142 7.370638 7.956420 8.274504 7.797095 7.085174 7.391546 8.073779 7.946711 7.459681 7.421153 7.817578 7.647485
ENSG00000181092 7.258937 6.458043 7.303626 6.478876 7.381171 11.545848 7.238828 7.798426 7.424927 6.799647 9.650939 7.228893 6.523399 6.499162
ENSG00000111647 8.841290 9.283562 9.195756 9.381275 9.576803 8.840484 8.706506 8.800267 9.243623 9.101699 8.655061 9.339355 9.380740 9.269440
21 29 34 52 43 35 22 40 7 10 1E 1G 2E 2G
ENSG00000204291 10.997339 11.107737 10.770590 10.960402 10.885718 10.590259 10.583763 10.968831 10.965546 11.328429 10.936191 11.163457 11.314361 10.890772
ENSG00000184811 6.315421 5.861928 6.554811 6.702126 6.476329 6.001576 6.991793 6.109806 6.206226 5.804093 6.043829 5.913381 6.178723 6.457986
ENSG00000181026 6.648686 6.683760 6.851780 7.504517 7.509151 7.002979 7.906034 6.972054 7.845749 6.827531 7.107113 7.003875 6.928483 7.235240
ENSG00000116774 7.582559 7.994108 7.437610 7.496085 7.509151 7.652446 7.380430 8.053778 7.605640 7.836615 7.727412 7.768252 7.716256 7.480534
ENSG00000181092 6.364703 6.365196 7.625352 8.108234 7.666087 6.497763 8.520699 6.286733 6.839662 6.002341 6.080802 6.112804 7.523603 8.110002
ENSG00000111647 9.346991 9.191724 8.890163 9.032868 9.097942 9.370018 8.620414 9.424248 9.103315 9.326864 9.319878 9.755699 9.853405 9.493021
4A 4E 5A 6A 9C 8G 9G 3G
ENSG00000204291 10.862655 10.777234 10.949310 10.746488 10.748190 10.507918 10.687211 11.076330
ENSG00000184811 6.912408 6.867687 6.827561 6.265187 6.788843 6.176201 6.340955 5.862065
ENSG00000181026 8.205345 7.612327 7.537577 7.214443 7.790267 7.589981 7.502257 6.765510
ENSG00000116774 7.536883 7.465860 7.655000 7.286538 7.634763 6.976537 7.476813 7.847833
ENSG00000181092 8.978138 9.020936 8.772010 6.900595 8.196456 6.665769 7.088342 5.862065
ENSG00000111647 9.409104 9.369224 8.855989 8.970096 9.175344 8.851148 8.874898 9.416211
vst_degs_by_pvalue <- cbind(gene_symbol = annotated_degs_by_pvalue$SYMBOL, vst_degs_by_pvalue)
write.csv(vst_degs_by_pvalue, file = "../result/muscle/estrogen_vst_DEGs_expression_by_pvalue.csv", row.names = TRUE)
write.csv(metadata.subset, file = "../result/muscle/estrogen_clinical_data.csv")
df_res <- as.data.frame(res)
df_res_ordered_by_pvalue <- df_res[order(df_res$pvalue),]
df_res_ordered_by_pvalue$ENSEMBL <- rownames(df_res_ordered_by_pvalue)
annotated_df_res_ordered_by_pvalue <- merge(df_res_ordered_by_pvalue,annotation,by='ENSEMBL',all.x = TRUE)
# remove duplicated and NA values in SYMBOL column
annotated_df_res_ordered_by_pvalue <- annotated_df_res_ordered_by_pvalue %>%
filter(!is.na(SYMBOL)) %>%
distinct(SYMBOL, .keep_all = TRUE) %>%
filter(!is.na(ENSEMBL))
all_gene_id <- annotated_df_res_ordered_by_pvalue$ENSEMBL
all_vst_matrix <- vst_matrix[all_gene_id,]
all_rld_matrix <- cbind(gene_symbol = annotated_df_res_ordered_by_pvalue$SYMBOL,all_vst_matrix)
write.csv(all_rld_matrix, file = "../result/muscle/estrogen_vst_all_genes.csv",row.names = TRUE)